Using 'Most Recent Data' - 141 MB CSV
In [1]:
import pandas as pd
import numpy as np
import os

import folium
In [2]:
path = '/Users/Weiyang/Downloads'
In [3]:
os.chdir(path)
In [4]:
dataset = pd.DataFrame.from_csv('dataset.csv')
/Users/Weiyang/anaconda3/lib/python3.6/site-packages/IPython/core/interactiveshell.py:2802: DtypeWarning: Columns (6,9,1608,1619,1620,1621,1622,1623,1624,1625,1626,1627,1628,1629,1688,1689,1690,1691,1692,1703,1704,1725,1726,1727,1728,1729,1743,1815,1816,1817,1818,1823,1824) have mixed types. Specify dtype option on import or set low_memory=False.
  if self.run_code(code, result):

Problem: Are students who pursue Computer Science better off than Engineering students after graduation?

Selecting columns based on Data Dictionary

In [5]:
target = ['INSTNM', 'LATITUDE', 'LONGITUDE', 'ADM_RATE', 'CONTROL', 'LOCALE', 'SAT_AVG', 'PCIP11', 'PCIP14', 'TUITIONFEE_IN', 'TUITIONFEE_OUT', 'MEDIAN_HH_INC', 'UNEMP_RATE', 'GRAD_DEBT_MDN', 'COMPL_RPY_1YR_RT', 'COMPL_RPY_3YR_RT']
In [6]:
focus = dataset[target]
In [7]:
focus.shape
Out[7]:
(7593, 16)
In [8]:
focus.head()
Out[8]:
INSTNM LATITUDE LONGITUDE ADM_RATE CONTROL LOCALE SAT_AVG PCIP11 PCIP14 TUITIONFEE_IN TUITIONFEE_OUT MEDIAN_HH_INC UNEMP_RATE GRAD_DEBT_MDN COMPL_RPY_1YR_RT COMPL_RPY_3YR_RT
UNITID
100654 Alabama A & M University 34.783368 -86.568502 0.6538 1 12.0 850.0 0.0634 0.1432 9366.0 17136.0 49720.22 4.84000015258789 35000 0.317436662 0.4110512129
100663 University of Alabama at Birmingham 33.502230 -86.809170 0.6043 1 12.0 1147.0 0.0133 0.0577 7766.0 17654.0 55735.22 3.45000004768371 21500 0.573300971 0.6246013667
100690 Amridge University 32.362609 -86.174010 NaN 2 12.0 NaN 0.0000 0.0000 6900.0 6900.0 53683.7 3.59999990463256 23000 0.375 0.4576271186
100706 University of Alabama in Huntsville 34.722818 -86.638420 0.8120 1 12.0 1221.0 0.0315 0.3027 9128.0 20622.0 58688.62 3.64000010490417 23500 0.642589118 0.7098821396
100724 Alabama State University 32.364317 -86.295677 0.4639 1 12.0 844.0 0.0567 0.0000 8720.0 15656.0 46065.2 4.80999994277954 32091 0.212041885 0.3545232274

Getting rid of rows with 'Privacy Suppressed'

In [9]:
new_target = ['MEDIAN_HH_INC', 'UNEMP_RATE', 'GRAD_DEBT_MDN', 'COMPL_RPY_1YR_RT', 'COMPL_RPY_3YR_RT']
In [10]:
for i in new_target:
    
    focus = focus[focus[i] != 'PrivacySuppressed']
In [11]:
focus.shape
Out[11]:
(5381, 16)

Casting rows as numeric

In [12]:
for i in new_target:
    
    focus[i] = pd.to_numeric(focus[i])
In [13]:
focus.describe()
Out[13]:
LATITUDE LONGITUDE ADM_RATE CONTROL LOCALE SAT_AVG PCIP11 PCIP14 TUITIONFEE_IN TUITIONFEE_OUT MEDIAN_HH_INC UNEMP_RATE GRAD_DEBT_MDN COMPL_RPY_1YR_RT COMPL_RPY_3YR_RT
count 4987.000000 4987.000000 1897.000000 5381.000000 4987.000000 1213.000000 4962.000000 4962.000000 3663.000000 3453.000000 4000.000000 4000.000000 5358.000000 5066.000000 5235.000000
mean 37.694088 -90.194047 0.687423 2.182680 19.251053 1055.847486 0.035870 0.010739 15420.069342 18426.066030 59479.855670 3.739755 17400.904255 0.501398 0.542261
std 5.553862 15.928390 0.197470 0.842794 9.044168 131.617679 0.090764 0.051922 11561.576484 10691.585097 11834.376864 1.043963 8363.903118 0.194977 0.193340
min 13.432726 -159.395966 0.000000 1.000000 11.000000 708.000000 0.000000 0.000000 886.000000 1040.000000 17043.010000 1.910000 1500.000000 0.073593 0.085837
25% 34.043998 -97.053879 0.565900 1.000000 12.000000 970.000000 0.000000 0.000000 6119.500000 10616.000000 51499.752500 3.070000 9500.000000 0.328363 0.375000
50% 39.045646 -86.123924 0.708500 2.000000 21.000000 1038.000000 0.004050 0.000000 13032.000000 16328.000000 59225.360000 3.500000 16250.000000 0.488816 0.533708
75% 41.501865 -78.963899 0.831100 3.000000 21.000000 1118.000000 0.032575 0.000000 19568.000000 24170.000000 66759.580000 4.110000 25000.000000 0.651660 0.697613
max 64.856483 144.802837 1.000000 3.000000 43.000000 1507.000000 1.000000 0.872400 53000.000000 53000.000000 98890.710000 9.890000 50750.000000 0.962604 0.973118

Mapping

In [14]:
focus.isnull().sum()
Out[14]:
INSTNM                 0
LATITUDE             394
LONGITUDE            394
ADM_RATE            3484
CONTROL                0
LOCALE               394
SAT_AVG             4168
PCIP11               419
PCIP14               419
TUITIONFEE_IN       1718
TUITIONFEE_OUT      1928
MEDIAN_HH_INC       1381
UNEMP_RATE          1381
GRAD_DEBT_MDN         23
COMPL_RPY_1YR_RT     315
COMPL_RPY_3YR_RT     146
dtype: int64
In [15]:
first_map = focus[np.isfinite(focus['LONGITUDE'])]

Note: Map is centered on Dallas, Texas

In [30]:
folium_map = folium.Map(location=(32.804407, -96.629080),
                     zoom_start = 8,
                     tiles='OpenStreetMap')

for idx, row in first_map.iterrows():
    green = '#228B22'
    orange = '#FFA500'
    blue = '#0000FF'
    red = '#ff4545'
    
    ### Applying manual scaling by 30
    radius = row['PCIP11']*30
    threshold = row['GRAD_DEBT_MDN']
    
    if threshold > first_map['GRAD_DEBT_MDN'].quantile(0.25) and threshold <= first_map['GRAD_DEBT_MDN'].quantile(0.5):
        color = green
    elif threshold > first_map['GRAD_DEBT_MDN'].quantile(0.5) and threshold <= first_map['GRAD_DEBT_MDN'].quantile(0.75):
        color = orange
    elif threshold > first_map['GRAD_DEBT_MDN'].quantile(0.75):
        color = red
    else:
        color = blue
        
    folium.CircleMarker(location = (row['LATITUDE'], row['LONGITUDE']),
                        radius = radius,
                        color = color,
                        fill=True).add_to(folium_map)
In [31]:
folium_map.save('CS_Map.html')
In [32]:
from IPython.core.display import display, HTML
display(HTML('CS_Map copy.html'))
In [26]:
folium_map2 = folium.Map(location=(32.804407, -96.629080),
                     zoom_start = 8,
                     tiles='OpenStreetMap')

for idx, row in first_map.iterrows():
    green = '#228B22'
    orange = '#FFA500'
    blue = '#0000FF'
    red = '#ff4545'
    
    ### Applying manual scaling by 30
    radius = row['PCIP14']*30
    threshold = row['GRAD_DEBT_MDN']
    
    if threshold > first_map['GRAD_DEBT_MDN'].quantile(0.25) and threshold <= first_map['GRAD_DEBT_MDN'].quantile(0.5):
        color = green
    elif threshold > first_map['GRAD_DEBT_MDN'].quantile(0.5) and threshold <= first_map['GRAD_DEBT_MDN'].quantile(0.75):
        color = orange
    elif threshold > first_map['GRAD_DEBT_MDN'].quantile(0.75):
        color = red
    else:
        color = blue
        
    folium.CircleMarker(location = (row['LATITUDE'], row['LONGITUDE']),
                        radius = radius,
                        color = color,
                        fill=True).add_to(folium_map2)
In [27]:
folium_map2.save('Eng_Map.html')
In [28]:
from IPython.core.display import display, HTML
display(HTML('Eng_Map copy.html'))